import pandas as pd
import numpy as np

# Number of synthetic user rows to generate.
size = 1000

# Age-band boundaries: band i covers the half-open interval
# [age_ranges[i], age_ranges[i + 1]).
age_ranges = [12, 19, 31, 51, 85]
age_distributions = [0.5, 0.2, 0.1, 0.2]  # share of users in each age band


def generate_ages(total, bounds, shares):
    """Return a shuffled list of ``total`` random ages following ``shares``.

    Args:
        total: Number of ages to produce.
        bounds: Ascending band boundaries; band ``i`` draws integers from
            ``[bounds[i], bounds[i + 1])``.
        shares: Relative weight of each band (one entry per band).

    Returns:
        A list of exactly ``total`` integer ages in random order.
    """
    ages = []
    for low, high, share in zip(bounds, bounds[1:], shares):
        ages.extend(np.random.randint(low, high, size=int(total * share)))

    # int() truncates, so the per-band counts can add up to fewer than
    # ``total``. Top up the remainder by picking bands at random with the
    # same weights, which keeps the overall distribution intact.
    shortfall = total - len(ages)
    if shortfall > 0:
        weights = np.asarray(shares, dtype=float)
        picked = np.random.choice(
            len(weights), size=shortfall, p=weights / weights.sum()
        )
        lows = np.asarray(bounds[:-1])[picked]
        highs = np.asarray(bounds[1:])[picked]
        ages.extend(np.random.randint(lows, highs))

    # Shuffle so the bands are not laid out in contiguous runs, then trim in
    # case the shares summed to more than 1.
    np.random.shuffle(ages)
    return ages[:total]


age_choices = generate_ages(size, age_ranges, age_distributions)

# Categorical choices and weights for the "sex" column.
sex_choices = ['male', 'female']
sex_probs = [0.6, 0.4]

df = pd.DataFrame({
    # Sample without replacement so no two users share an id. This requires
    # size <= 2,000,000 (the number of ids available in the range).
    "user_id": np.random.choice(
        np.arange(1000000, 3000000), size=size, replace=False
    ),
    "age": age_choices,
    "sex": np.random.choice(sex_choices, size=size, p=sex_probs),
    # randint's upper bound is exclusive: values are 1..33, 1..7 and 1..4.
    "province_id": np.random.randint(1, 34, size=size),
    "times_id": np.random.randint(1, 8, size=size),
    "market_id": np.random.randint(1, 5, size=size),
})

df.to_csv("data_users.csv", index=False)